Taek_Blog - Opendata

Data Transformaion

1 오픈데이터 분석 실습 : Data Transformation

1.1 패키지 불러오기

library(tidyverse)
library(nycflights13)

1.2 간단한 데이터셋 파악

nycflights13

### 도움말
#?flights

### 요약
flights %>% str()

tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
 $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
 $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
 $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
 $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
 $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
 $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
 $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
 $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
 $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
 $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
 $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
 $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
 $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
 $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
 $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
 $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
 $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
 $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
 $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

### 컬럼 이름
flights %>% colnames()

 [1] "year"           "month"          "day"            "dep_time"      
 [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
 [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
[13] "origin"         "dest"           "air_time"       "distance"      
[17] "hour"           "minute"         "time_hour"

2 dplyr 기초

2.1 filter

조건 필터링

### table - 빈도수 파악
flights$month %>% table()

.
    1     2     3     4     5     6     7     8     9    10    11    12 
27004 24951 28834 28330 28796 28243 29425 29327 27574 28889 27268 28135

### 월별 필터링
flights %>%
  filter(month==12 | month==11)

# A tibble: 55,403 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013    11     1        5       2359       6     352     345       7 B6     
 2  2013    11     1       35       2250     105     123    2356      87 B6     
 3  2013    11     1      455        500      -5     641     651     -10 US     
 4  2013    11     1      539        545      -6     856     827      29 UA     
 5  2013    11     1      542        545      -3     831     855     -24 AA     
 6  2013    11     1      549        600     -11     912     923     -11 UA     
 7  2013    11     1      550        600     -10     705     659       6 US     
 8  2013    11     1      554        600      -6     659     701      -2 US     
 9  2013    11     1      554        600      -6     826     827      -1 DL     
10  2013    11     1      554        600      -6     749     751      -2 DL     
# … with 55,393 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

### 월별 필터링 - %in% 사용
flights %>% 
  filter(month %in% c(11,12))

# A tibble: 55,403 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013    11     1        5       2359       6     352     345       7 B6     
 2  2013    11     1       35       2250     105     123    2356      87 B6     
 3  2013    11     1      455        500      -5     641     651     -10 US     
 4  2013    11     1      539        545      -6     856     827      29 UA     
 5  2013    11     1      542        545      -3     831     855     -24 AA     
 6  2013    11     1      549        600     -11     912     923     -11 UA     
 7  2013    11     1      550        600     -10     705     659       6 US     
 8  2013    11     1      554        600      -6     659     701      -2 US     
 9  2013    11     1      554        600      -6     826     827      -1 DL     
10  2013    11     1      554        600      -6     749     751      -2 DL     
# … with 55,393 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

2.2 missing values

결측값 NA

### 결측값은 연산 불가능
NA

[1] NA

NA > 5

[1] NA

10 == NA

[1] NA

NA + 19

[1] NA

NA / 2

[1] NA

### 결측값 개수
flights$dep_time %>%
  is.na() %>% 
  sum()

[1] 8255

### 결측값 있는 df 생성
df <- data.frame(x = c(1, NA, 3))
df

### 타입 확인
df %>% class()

[1] "data.frame"

### tibble : tidyverse의 df 클래스
tibble_df <- tibble(x = c(1, NA, 3))
tibble_df

# A tibble: 3 × 1
      x
  <dbl>
1     1
2    NA
3     3

tibble_df %>% filter(x > 1)

# A tibble: 1 × 1
      x
  <dbl>
1     3

tibble_df %>% filter(is.na(x) | x>1)

# A tibble: 2 × 1
      x
  <dbl>
1    NA
2     3

2.3 arrange

정렬

### 오름차순 정렬
flights %>% arrange(dep_time)

# A tibble: 336,776 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1    13        1       2249      72     108    2357      71 B6     
 2  2013     1    31        1       2100     181     124    2225     179 WN     
 3  2013    11    13        1       2359       2     442     440       2 B6     
 4  2013    12    16        1       2359       2     447     437      10 B6     
 5  2013    12    20        1       2359       2     430     440     -10 B6     
 6  2013    12    26        1       2359       2     437     440      -3 B6     
 7  2013    12    30        1       2359       2     441     437       4 B6     
 8  2013     2    11        1       2100     181     111    2225     166 WN     
 9  2013     2    24        1       2245      76     121    2354      87 B6     
10  2013     3     8        1       2355       6     431     440      -9 B6     
# … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

### 결측값은 가장 하단 배치
df <- tibble(x = c(5,2,NA))
arrange(df, x)

# A tibble: 3 × 1
      x
  <dbl>
1     2
2     5
3    NA

arrange(df, -x)

# A tibble: 3 × 1
      x
  <dbl>
1     5
2     2
3    NA

2.4 select

컬럼 선택

### 컬럼 선택 (순서 지정 가능)
flights %>% select(year, month, day)

# A tibble: 336,776 × 3
    year month   day
   <int> <int> <int>
 1  2013     1     1
 2  2013     1     1
 3  2013     1     1
 4  2013     1     1
 5  2013     1     1
 6  2013     1     1
 7  2013     1     1
 8  2013     1     1
 9  2013     1     1
10  2013     1     1
# … with 336,766 more rows

flights %>% select(year:day)

# A tibble: 336,776 × 3
    year month   day
   <int> <int> <int>
 1  2013     1     1
 2  2013     1     1
 3  2013     1     1
 4  2013     1     1
 5  2013     1     1
 6  2013     1     1
 7  2013     1     1
 8  2013     1     1
 9  2013     1     1
10  2013     1     1
# … with 336,766 more rows

### 컬럼 제외
flights %>% select(-(year:day))

# A tibble: 336,776 × 16
   dep_t…¹ sched…² dep_d…³ arr_t…⁴ sched…⁵ arr_d…⁶ carrier flight tailnum origin
     <int>   <int>   <dbl>   <int>   <int>   <dbl> <chr>    <int> <chr>   <chr> 
 1     517     515       2     830     819      11 UA        1545 N14228  EWR   
 2     533     529       4     850     830      20 UA        1714 N24211  LGA   
 3     542     540       2     923     850      33 AA        1141 N619AA  JFK   
 4     544     545      -1    1004    1022     -18 B6         725 N804JB  JFK   
 5     554     600      -6     812     837     -25 DL         461 N668DN  LGA   
 6     554     558      -4     740     728      12 UA        1696 N39463  EWR   
 7     555     600      -5     913     854      19 B6         507 N516JB  EWR   
 8     557     600      -3     709     723     -14 EV        5708 N829AS  LGA   
 9     557     600      -3     838     846      -8 B6          79 N593JB  JFK   
10     558     600      -2     753     745       8 AA         301 N3ALAA  LGA   
# … with 336,766 more rows, 6 more variables: dest <chr>, air_time <dbl>,
#   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>, and abbreviated
#   variable names ¹dep_time, ²sched_dep_time, ³dep_delay, ⁴arr_time,
#   ⁵sched_arr_time, ⁶arr_delay

### 시작 문자열 지정
flights %>% select(starts_with('dep'))

# A tibble: 336,776 × 2
   dep_time dep_delay
      <int>     <dbl>
 1      517         2
 2      533         4
 3      542         2
 4      544        -1
 5      554        -6
 6      554        -4
 7      555        -5
 8      557        -3
 9      557        -3
10      558        -2
# … with 336,766 more rows

### 끝 문자열 지정
flights %>% select(ends_with("time"))

# A tibble: 336,776 × 5
   dep_time sched_dep_time arr_time sched_arr_time air_time
      <int>          <int>    <int>          <int>    <dbl>
 1      517            515      830            819      227
 2      533            529      850            830      227
 3      542            540      923            850      160
 4      544            545     1004           1022      183
 5      554            600      812            837      116
 6      554            558      740            728      150
 7      555            600      913            854      158
 8      557            600      709            723       53
 9      557            600      838            846      140
10      558            600      753            745      138
# … with 336,766 more rows

### 포함 문자열 지정
flights %>% select(contains("time"))

# A tibble: 336,776 × 6
   dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
      <int>          <int>    <int>          <int>    <dbl> <dttm>             
 1      517            515      830            819      227 2013-01-01 05:00:00
 2      533            529      850            830      227 2013-01-01 05:00:00
 3      542            540      923            850      160 2013-01-01 05:00:00
 4      544            545     1004           1022      183 2013-01-01 05:00:00
 5      554            600      812            837      116 2013-01-01 06:00:00
 6      554            558      740            728      150 2013-01-01 05:00:00
 7      555            600      913            854      158 2013-01-01 06:00:00
 8      557            600      709            723       53 2013-01-01 06:00:00
 9      557            600      838            846      140 2013-01-01 06:00:00
10      558            600      753            745      138 2013-01-01 06:00:00
# … with 336,766 more rows

### 컬럼 이름 변경 new = old
flights %>% rename(tail_num = tailnum)

# A tibble: 336,776 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     1      517        515       2     830     819      11 UA     
 2  2013     1     1      533        529       4     850     830      20 UA     
 3  2013     1     1      542        540       2     923     850      33 AA     
 4  2013     1     1      544        545      -1    1004    1022     -18 B6     
 5  2013     1     1      554        600      -6     812     837     -25 DL     
 6  2013     1     1      554        558      -4     740     728      12 UA     
 7  2013     1     1      555        600      -5     913     854      19 B6     
 8  2013     1     1      557        600      -3     709     723     -14 EV     
 9  2013     1     1      557        600      -3     838     846      -8 B6     
10  2013     1     1      558        600      -2     753     745       8 AA     
# … with 336,766 more rows, 9 more variables: flight <int>, tail_num <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

### 순서 배치 + 나머지
flights %>% select(time_hour, air_time, everything())

# A tibble: 336,776 × 19
   time_hour           air_t…¹  year month   day dep_t…² sched…³ dep_d…⁴ arr_t…⁵
   <dttm>                <dbl> <int> <int> <int>   <int>   <int>   <dbl>   <int>
 1 2013-01-01 05:00:00     227  2013     1     1     517     515       2     830
 2 2013-01-01 05:00:00     227  2013     1     1     533     529       4     850
 3 2013-01-01 05:00:00     160  2013     1     1     542     540       2     923
 4 2013-01-01 05:00:00     183  2013     1     1     544     545      -1    1004
 5 2013-01-01 06:00:00     116  2013     1     1     554     600      -6     812
 6 2013-01-01 05:00:00     150  2013     1     1     554     558      -4     740
 7 2013-01-01 06:00:00     158  2013     1     1     555     600      -5     913
 8 2013-01-01 06:00:00      53  2013     1     1     557     600      -3     709
 9 2013-01-01 06:00:00     140  2013     1     1     557     600      -3     838
10 2013-01-01 06:00:00     138  2013     1     1     558     600      -2     753
# … with 336,766 more rows, 10 more variables: sched_arr_time <int>,
#   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
#   dest <chr>, distance <dbl>, hour <dbl>, minute <dbl>, and abbreviated
#   variable names ¹air_time, ²dep_time, ³sched_dep_time, ⁴dep_delay, ⁵arr_time

2.5 mutate

새로운 컬럼 생성

flights %>% mutate(
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)

# A tibble: 336,776 × 22
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     1      517        515       2     830     819      11 UA     
 2  2013     1     1      533        529       4     850     830      20 UA     
 3  2013     1     1      542        540       2     923     850      33 AA     
 4  2013     1     1      544        545      -1    1004    1022     -18 B6     
 5  2013     1     1      554        600      -6     812     837     -25 DL     
 6  2013     1     1      554        558      -4     740     728      12 UA     
 7  2013     1     1      555        600      -5     913     854      19 B6     
 8  2013     1     1      557        600      -3     709     723     -14 EV     
 9  2013     1     1      557        600      -3     838     846      -8 B6     
10  2013     1     1      558        600      -2     753     745       8 AA     
# … with 336,766 more rows, 12 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, gain <dbl>, hours <dbl>,
#   gain_per_hour <dbl>, and abbreviated variable names ¹sched_dep_time,
#   ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

### transmute : mutate와 다르게 반환된 값만 출력 
flights %>% transmute(
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)

# A tibble: 336,776 × 3
    gain hours gain_per_hour
   <dbl> <dbl>         <dbl>
 1    -9 3.78          -2.38
 2   -16 3.78          -4.23
 3   -31 2.67         -11.6 
 4    17 3.05           5.57
 5    19 1.93           9.83
 6   -16 2.5           -6.4 
 7   -24 2.63          -9.11
 8    11 0.883         12.5 
 9     5 2.33           2.14
10   -10 2.3           -4.35
# … with 336,766 more rows

2.6 유용한 기능

### 몫
5 %/% 3

[1] 1

### 나머지
5 %% 3

[1] 2

### 그룹화 + 집계값 요약
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay))

`summarise()` has grouped output by 'year', 'month'. You can override using the
`.groups` argument.

# A tibble: 365 × 4
# Groups:   year, month [12]
    year month   day  mean
   <int> <int> <int> <dbl>
 1  2013     1     1    NA
 2  2013     1     2    NA
 3  2013     1     3    NA
 4  2013     1     4    NA
 5  2013     1     5    NA
 6  2013     1     6    NA
 7  2013     1     7    NA
 8  2013     1     8    NA
 9  2013     1     9    NA
10  2013     1    10    NA
# … with 355 more rows

### 그룹화 + 집계값 요약 + NA 제거
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay, na.rm = T))

`summarise()` has grouped output by 'year', 'month'. You can override using the
`.groups` argument.

# A tibble: 365 × 4
# Groups:   year, month [12]
    year month   day  mean
   <int> <int> <int> <dbl>
 1  2013     1     1 11.5 
 2  2013     1     2 13.9 
 3  2013     1     3 11.0 
 4  2013     1     4  8.95
 5  2013     1     5  5.73
 6  2013     1     6  7.15
 7  2013     1     7  5.42
 8  2013     1     8  2.55
 9  2013     1     9  2.28
10  2013     1    10  2.84
# … with 355 more rows

### !is.na : 결측값 아닌 값들만 출력
flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))

# A tibble: 327,346 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     1      517        515       2     830     819      11 UA     
 2  2013     1     1      533        529       4     850     830      20 UA     
 3  2013     1     1      542        540       2     923     850      33 AA     
 4  2013     1     1      544        545      -1    1004    1022     -18 B6     
 5  2013     1     1      554        600      -6     812     837     -25 DL     
 6  2013     1     1      554        558      -4     740     728      12 UA     
 7  2013     1     1      555        600      -5     913     854      19 B6     
 8  2013     1     1      557        600      -3     709     723     -14 EV     
 9  2013     1     1      557        600      -3     838     846      -8 B6     
10  2013     1     1      558        600      -2     753     745       8 AA     
# … with 327,336 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

### 그룹화 해제 후 집계값 요약
group <- flights %>%
  group_by(year, month, day)

group %>% 
  ungroup() %>% 
  summarise(flights = n())

# A tibble: 1 × 1
  flights
    <int>
1  336776

### drop_na
tibble_df %>% drop_na()

# A tibble: 2 × 1
      x
  <dbl>
1     1
2     3

### mutate 컬럼 위치 지정
flights %>%
  mutate(mean_arr_time = mean(arr_time, na.rm = T), .after = arr_time)

# A tibble: 336,776 × 20
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ mean_…⁴ sched…⁵ arr_d…⁶
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <dbl>   <int>   <dbl>
 1  2013     1     1      517        515       2     830   1502.     819      11
 2  2013     1     1      533        529       4     850   1502.     830      20
 3  2013     1     1      542        540       2     923   1502.     850      33
 4  2013     1     1      544        545      -1    1004   1502.    1022     -18
 5  2013     1     1      554        600      -6     812   1502.     837     -25
 6  2013     1     1      554        558      -4     740   1502.     728      12
 7  2013     1     1      555        600      -5     913   1502.     854      19
 8  2013     1     1      557        600      -3     709   1502.     723     -14
 9  2013     1     1      557        600      -3     838   1502.     846      -8
10  2013     1     1      558        600      -2     753   1502.     745       8
# … with 336,766 more rows, 10 more variables: carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴mean_arr_time, ⁵sched_arr_time,
#   ⁶arr_delay

2.7 rank, slice

행의 범위 지정

### 년/월/일자별 top10 'arr_delay' 출력

### rank 사용
flights %>% 
  group_by(year,month,day) %>% 
  filter(rank(desc(arr_delay)) < 11) %>% 
  select(year,month,day, arr_delay)

# A tibble: 3,691 × 4
# Groups:   year, month, day [365]
    year month   day arr_delay
   <int> <int> <int>     <dbl>
 1  2013     1     1       851
 2  2013     1     1       338
 3  2013     1     1       263
 4  2013     1     1       166
 5  2013     1     1       174
 6  2013     1     1       222
 7  2013     1     1       250
 8  2013     1     1       246
 9  2013     1     1       191
10  2013     1     1       456
# … with 3,681 more rows

### slice 사용
flights %>% 
  group_by(year,month,day) %>% 
  slice_max(arr_delay,n=10) %>% 
  select(year,month,day, arr_delay)

# A tibble: 3,697 × 4
# Groups:   year, month, day [365]
    year month   day arr_delay
   <int> <int> <int>     <dbl>
 1  2013     1     1       851
 2  2013     1     1       456
 3  2013     1     1       338
 4  2013     1     1       263
 5  2013     1     1       250
 6  2013     1     1       246
 7  2013     1     1       222
 8  2013     1     1       191
 9  2013     1     1       174
10  2013     1     1       166
# … with 3,687 more rows

### 순위 책정
flights %>% 
  select(year,month,day,arr_delay) %>% 
  arrange(year,month,day,desc(arr_delay)) %>% 
  group_by(year,month,day) %>% 
  mutate(rank = rank(desc(arr_delay)))

# A tibble: 336,776 × 5
# Groups:   year, month, day [365]
    year month   day arr_delay  rank
   <int> <int> <int>     <dbl> <dbl>
 1  2013     1     1       851     1
 2  2013     1     1       456     2
 3  2013     1     1       338     3
 4  2013     1     1       263     4
 5  2013     1     1       250     5
 6  2013     1     1       246     6
 7  2013     1     1       222     7
 8  2013     1     1       191     8
 9  2013     1     1       174     9
10  2013     1     1       166    10
# … with 336,766 more rows

3 Quiz 1

dep_time의 결측지를 dep_time의 평균 값으로 교체하시오

### NA 제외한 평균값
mean_dep_time <- mean(flights$dep_time,na.rm = T) %>% as.integer()
mean_dep_time

[1] 1349

### repalce_na 
flights$dep_time <- flights$dep_time %>% 
  replace_na(mean_dep_time)

flights$dep_time %>% is.na() %>% sum()

[1] 0

### mutate + ifelse
flights %>% 
  mutate(dep_time = ifelse(is.na(dep_time),
                           mean_dep_time,dep_time))

# A tibble: 336,776 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     1      517        515       2     830     819      11 UA     
 2  2013     1     1      533        529       4     850     830      20 UA     
 3  2013     1     1      542        540       2     923     850      33 AA     
 4  2013     1     1      544        545      -1    1004    1022     -18 B6     
 5  2013     1     1      554        600      -6     812     837     -25 DL     
 6  2013     1     1      554        558      -4     740     728      12 UA     
 7  2013     1     1      555        600      -5     913     854      19 B6     
 8  2013     1     1      557        600      -3     709     723     -14 EV     
 9  2013     1     1      557        600      -3     838     846      -8 B6     
10  2013     1     1      558        600      -2     753     745       8 AA     
# … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

4 Quiz 2

1. 월별 비행기 개수를 구하시오
1. dest(도착공항)별로 비행기가 10000대 이상 착륙한 공항으로 도착한 비행정보만 추출

### 1
flights %>%
  group_by(month) %>% 
  summarise(fly_count = n())

# A tibble: 12 × 2
   month fly_count
   <int>     <int>
 1     1     27004
 2     2     24951
 3     3     28834
 4     4     28330
 5     5     28796
 6     6     28243
 7     7     29425
 8     8     29327
 9     9     27574
10    10     28889
11    11     27268
12    12     28135

### 2
air <- flights %>% 
  group_by(dest) %>%
  filter(n() >= 10000)

air %>% nrow()

[1] 131440

table(air$dest)


  ATL   BOS   CLT   FLL   LAX   MCO   MIA   ORD   SFO 
17215 15508 14064 12055 16174 14082 11728 17283 13331

5 Quiz 3

다음은 미국 NBA 농구리그의 농구선수별 게임당 경기통계(stats)이다.

Pos: 농구에서 선수의 포지션
Age: 나이
Tm: 팀이름
3P: 3점슛 성공횟수
3PA: 3점슛 시도횟수
3P%: 3점 성공률
PTS: 평균득점

일 때 다음 물음에 답하시오.

stats <-  read_csv("C:/Users/seong taek/Desktop/3-1 Opendata_Analysis/opendata/nba2021_per_game.csv")

Rows: 497 Columns: 29
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (3): Player, Pos, Tm
dbl (26): Age, G, GS, MP, FG, FGA, FG%, 3P, 3PA, 3P%, 2P, 2PA, 2P%, eFG%, FT...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

stats

# A tibble: 497 × 29
   Player      Pos     Age Tm        G    GS    MP    FG   FGA `FG%`  `3P` `3PA`
   <chr>       <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 Precious A… PF       21 MIA      28     2  14.6   2.6   4.4 0.59    0     0  
 2 Jaylen Ada… PG       24 MIL       6     0   2.8   0.2   1.3 0.125   0     0.3
 3 Steven Ada… C        27 NOP      27    27  28.1   3.5   5.8 0.603   0     0  
 4 Bam Adebayo C        23 MIA      26    26  33.6   7.4  12.9 0.573   0.1   0.2
 5 LaMarcus A… C        35 SAS      18    18  26.7   5.9  12.5 0.476   1.3   3.7
 6 Ty-Shon Al… SG       22 PHO       3     0   2.7   0     1   0       0     0.3
 7 Nickeil Al… SG       22 NOP      23     3  19.2   3.3   8.2 0.41    1     3.8
 8 Grayson Al… SG       25 MEM      19     8  23.9   3.2   7.4 0.429   2.3   5.3
 9 Jarrett Al… C        22 TOT      28    10  26.2   4.4   6.8 0.642   0     0.1
10 Jarrett Al… C        22 BRK      12     5  26.7   3.7   5.4 0.677   0     0  
# … with 487 more rows, and 17 more variables: `3P%` <dbl>, `2P` <dbl>,
#   `2PA` <dbl>, `2P%` <dbl>, `eFG%` <dbl>, FT <dbl>, FTA <dbl>, `FT%` <dbl>,
#   ORB <dbl>, DRB <dbl>, TRB <dbl>, AST <dbl>, STL <dbl>, BLK <dbl>,
#   TOV <dbl>, PF <dbl>, PTS <dbl>

5.1 1. 위의 dataframe을 가지고 group 별 통계값을 계산하시오.

NBA 농구팀별(Tm)로 가장 평균득점(PTS)이 높은 사람과 낮은 사람을 추출하는 코드를 작성하시오.
또한, 팀별로 평균득점 최대값과 최소값의 차이를 gap이라는 컬럼을 새로 만들어 나타내시오.

stats %>% 
  group_by(Tm) %>% 
  summarise(max_score = max(PTS),
         min_score = min(PTS),
         gap = max_score - min_score,
         max_player = Player[which.max(PTS)],
         min_player = Player[which.min(PTS)])

# A tibble: 31 × 6
   Tm    max_score min_score   gap max_player     min_player       
   <chr>     <dbl>     <dbl> <dbl> <chr>          <chr>            
 1 ATL        26.5       1.8  24.7 Trae Young     Bruno Fernando   
 2 BOS        25.9       2.4  23.5 Jaylen Brown   Tremont Waters   
 3 BRK        29         0    29   Kevin Durant   Noah Vonleh      
 4 CHI        28.5       1.5  27   Zach LaVine    Luke Kornet      
 5 CHO        22.3       1    21.3 Gordon Hayward Vernon Carey Jr. 
 6 CLE        22.8       1.2  21.6 Collin Sexton  Marques Bolden   
 7 DAL        29.1       1    28.1 Luka Dončić    Tyrell Terry     
 8 DEN        27.4       0.6  26.8 Nikola Jokić   Vlatko Čančar    
 9 DET        23.8       0    23.8 Jerami Grant   Deividas Sirvydis
10 GSW        30         1.5  28.5 Stephen Curry  Nico Mannion     
# … with 21 more rows

5.2 2. 각 포지션별로(Pos) 평균나이 대비 해당 선수의 나이가 몇배 높거나 낮은지 비율을 계산하시오

예를들어 PG 포지션의 평균나이는 27세이고, Chris Paul의 나이는 35세이므로, 비율은 35/27이 된다.
모든 선수에 대해 이 비율을 age_ratio_by_position이라는 새로운 컬럼으로 저장하는 코드를 작성하시오.
단 , 포지션 별로 비율이 가장 높은 한명만 추출

stats %>% 
  group_by(Pos) %>% 
  mutate(mean_age = mean(Age),
         age_ratio_by_position = Age/mean_age) %>% 
  select(Player, Pos, Age, mean_age, age_ratio_by_position) %>% 
  arrange(-age_ratio_by_position) %>% 
  slice_max(age_ratio_by_position, n=1)

# A tibble: 9 × 5
# Groups:   Pos [9]
  Player          Pos     Age mean_age age_ratio_by_position
  <chr>           <chr> <dbl>    <dbl>                 <dbl>
1 Marc Gasol      C        36     26.1                  1.38
2 Noah Vonleh     F        25     24                    1.04
3 Norvel Pelle    F-C      27     27                    1   
4 Derrick Rose    G        32     27                    1.19
5 Carmelo Anthony PF       36     25.9                  1.39
6 LeBron James    PG       36     25.6                  1.41
7 Andre Iguodala  SF       37     25.4                  1.46
8 Rodions Kurucs  SF-PF    22     22                    1   
9 J.J. Redick     SG       36     25.3                  1.43